// SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
// SPDX-FileCopyrightText: Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

/******************************************************************************
 * Simple demonstration of cub::BlockReduce
 *
 * To compile using the command line:
 *   nvcc -arch=sm_XX example_block_reduce.cu -I../.. -lcudart -O3
 *
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console (define before including cub.h)
#define CUB_STDERR

#include <cub/block/block_load.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_store.cuh>

#include <iostream>

#include <stdio.h>

#include "../../test/test_util.h"

using namespace cub;

//---------------------------------------------------------------------
// Globals, constants and aliases
//---------------------------------------------------------------------

/// Verbose output
bool g_verbose = false;

/// Timing iterations
int g_timing_iterations = 100;

/// Default grid size
int g_grid_size = 1;

//---------------------------------------------------------------------
// Kernels
//---------------------------------------------------------------------

/**
 * Kernel that computes a single block-wide sum over a tile of input.
 *
 * Expects a launch of BLOCK_THREADS threads per block; each thread owns
 * ITEMS_PER_THREAD elements, loaded from d_in in a striped arrangement.
 * Thread 0 writes the block aggregate to d_out and the reduction's
 * cycle count to d_elapsed.
 */
template <int BLOCK_THREADS,
          int ITEMS_PER_THREAD,
          BlockReduceAlgorithm ALGORITHM>
__global__ void BlockReduceKernel(int* d_in, // Tile of input
                                  int* d_out, // Tile aggregate
                                  clock_t* d_elapsed) // Elapsed cycle count of block reduction
{
  // Parameterize the BlockReduce collective for this block configuration
  using BlockReduceT = BlockReduce<int, BLOCK_THREADS, ALGORITHM>;

  // Scratch storage required by the collective
  __shared__ typename BlockReduceT::TempStorage temp_storage;

  // Load this thread's slice of the tile (striped across the block)
  int items[ITEMS_PER_THREAD];
  LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, items);

  // Time only the reduction itself using the per-SM cycle counter
  clock_t t0        = clock();
  int block_sum     = BlockReduceT(temp_storage).Sum(items);
  clock_t t1        = clock();

  // Only thread 0 holds the valid aggregate; it publishes the results
  if (threadIdx.x == 0)
  {
    // Absolute difference guards against counter wrap between samples
    *d_elapsed = (t0 > t1) ? t0 - t1 : t1 - t0;
    *d_out     = block_sum;
  }
}

//---------------------------------------------------------------------
// Host utilities
//---------------------------------------------------------------------

/**
 * Fill h_in with a deterministic test pattern and compute its sum on the host.
 * Returns the aggregate, which serves as the reference solution.
 */
int Initialize(int* h_in, int num_items)
{
  int total = 0;

  for (int i = 0; i < num_items; ++i)
  {
    int value = i % 17; // small repeating values keep the sum overflow-safe
    h_in[i]   = value;
    total += value;
  }

  return total;
}

/**
 * Test thread block reduction.
 *
 * Runs BlockReduceKernel once and checks the aggregate against a host
 * reference, then runs it g_timing_iterations more times to gather average
 * wall-clock and in-kernel cycle-count statistics.
 *
 * Fixes over the previous revision:
 *  - removed the unused h_gpu host allocation
 *  - all CUDA runtime calls are now checked via CubDebugExit (consistent with
 *    the rest of the file; CUB_STDERR is defined for exactly this purpose)
 *  - launch errors are checked before the correctness comparison
 *  - averages are guarded against --i=0 (division by zero)
 */
template <int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockReduceAlgorithm ALGORITHM>
void Test()
{
  constexpr int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;

  // Allocate host input array
  int* h_in = new int[TILE_SIZE];

  // Initialize problem and reference output on host
  int h_aggregate = Initialize(h_in, TILE_SIZE);

  // Initialize device arrays (check every CUDA call so failures are loud)
  int* d_in          = nullptr;
  int* d_out         = nullptr;
  clock_t* d_elapsed = nullptr;
  CubDebugExit(cudaMalloc((void**) &d_in, sizeof(int) * TILE_SIZE));
  CubDebugExit(cudaMalloc((void**) &d_out, sizeof(int) * 1));
  CubDebugExit(cudaMalloc((void**) &d_elapsed, sizeof(clock_t)));

  // Display input problem data
  if (g_verbose)
  {
    printf("Input data: ");
    for (int i = 0; i < TILE_SIZE; i++)
    {
      printf("%d, ", h_in[i]);
    }
    printf("\n\n");
  }

  // Kernel props
  int max_sm_occupancy;
  CubDebugExit(
    MaxSmOccupancy(max_sm_occupancy, BlockReduceKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));

  // Copy problem to device
  CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice));

  printf("BlockReduce algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d "
         "SM occupancy):\n",
         (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : "BLOCK_REDUCE_WARP_REDUCTIONS",
         TILE_SIZE,
         g_timing_iterations,
         g_grid_size,
         BLOCK_THREADS,
         ITEMS_PER_THREAD,
         max_sm_occupancy);

  // Run kernel once for correctness
  BlockReduceKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(d_in, d_out, d_elapsed);

  // Surface any launch-configuration error before comparing results
  CubDebugExit(cudaPeekAtLastError());

  // Check total aggregate (CompareDeviceResults copies d_out back, which synchronizes)
  printf("\tAggregate: ");
  int compare = CompareDeviceResults(&h_aggregate, d_out, 1, g_verbose, g_verbose);
  printf("%s\n", compare ? "FAIL" : "PASS");
  AssertEquals(0, compare);

  // Run this several times and average the performance results
  GpuTimer timer;
  float elapsed_millis   = 0.0;
  clock_t elapsed_clocks = 0;

  for (int i = 0; i < g_timing_iterations; ++i)
  {
    // Copy problem to device
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice));

    timer.Start();

    // Run kernel
    BlockReduceKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>
      <<<g_grid_size, BLOCK_THREADS>>>(d_in, d_out, d_elapsed);

    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    // Copy clocks from device
    clock_t clocks;
    CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
    elapsed_clocks += clocks;
  }

  // Check for kernel errors and STDIO from the kernel, if any
  CubDebugExit(cudaPeekAtLastError());
  CubDebugExit(cudaDeviceSynchronize());

  // Display timing results (guard against --i=0 producing division by zero)
  int iterations            = (g_timing_iterations > 0) ? g_timing_iterations : 1;
  float avg_millis          = elapsed_millis / iterations;
  float avg_items_per_sec   = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
  float avg_clocks          = float(elapsed_clocks) / iterations;
  float avg_clocks_per_item = avg_clocks / TILE_SIZE;

  printf("\tAverage BlockReduce::Sum clocks: %.3f\n", avg_clocks);
  printf("\tAverage BlockReduce::Sum clocks per item: %.3f\n", avg_clocks_per_item);
  printf("\tAverage kernel millis: %.4f\n", avg_millis);
  printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);

  // Cleanup
  delete[] h_in; // delete[] on nullptr is a no-op, no guard needed
  if (d_in)
  {
    CubDebugExit(cudaFree(d_in));
  }
  if (d_out)
  {
    CubDebugExit(cudaFree(d_out));
  }
  if (d_elapsed)
  {
    CubDebugExit(cudaFree(d_elapsed));
  }
}

/**
 * Main: parse command-line options, initialize the device, and sweep
 * BlockReduce configurations (varying threads/items-per-thread) for both
 * reduction algorithms.
 */
int main(int argc, char** argv)
{
  // Parse command line
  CommandLineArgs args(argc, argv);
  g_verbose = args.CheckCmdLineFlag("v");
  args.GetCmdLineArgument("i", g_timing_iterations);
  args.GetCmdLineArgument("grid-size", g_grid_size);

  // Print usage and bail out if requested
  bool show_help = args.CheckCmdLineFlag("help");
  if (show_help)
  {
    printf("%s "
           "[--device=<device-id>] "
           "[--i=<timing iterations>] "
           "[--grid-size=<grid size>] "
           "[--v] "
           "\n",
           argv[0]);
    exit(0);
  }

  // Initialize device
  CubDebugExit(args.DeviceInit());

  // Raking algorithm sweep: constant tile size, varying block shape
  Test<1024, 1, BLOCK_REDUCE_RAKING>();
  Test<512, 2, BLOCK_REDUCE_RAKING>();
  Test<256, 4, BLOCK_REDUCE_RAKING>();
  Test<128, 8, BLOCK_REDUCE_RAKING>();
  Test<64, 16, BLOCK_REDUCE_RAKING>();
  Test<32, 32, BLOCK_REDUCE_RAKING>();
  Test<16, 64, BLOCK_REDUCE_RAKING>();

  printf("-------------\n");

  // Warp-reductions algorithm sweep: same shapes for comparison
  Test<1024, 1, BLOCK_REDUCE_WARP_REDUCTIONS>();
  Test<512, 2, BLOCK_REDUCE_WARP_REDUCTIONS>();
  Test<256, 4, BLOCK_REDUCE_WARP_REDUCTIONS>();
  Test<128, 8, BLOCK_REDUCE_WARP_REDUCTIONS>();
  Test<64, 16, BLOCK_REDUCE_WARP_REDUCTIONS>();
  Test<32, 32, BLOCK_REDUCE_WARP_REDUCTIONS>();
  Test<16, 64, BLOCK_REDUCE_WARP_REDUCTIONS>();

  return 0;
}
